In [1]:
# Settings

# See "./sample input files" for example - solr / lucene stop word file format (one word per line, text file)
STOP_WORDS_FILE = "/Users/simon.hughes/Software/Solr/solr-5.1.0/server/solr/DiceJobs/conf/dice_stop_words.txt"

# Keyword / phrase files: generated in step 2, or configured manually from your search logs.
# These are expected to be in the solr / lucene / ES synonym file format. The python code below mimics the
# solr analysis chain logic, and can be used to filter the text down to just the words and phrases in these files.
# See "./sample input files" for examples, and the illustration below this cell for the expected format.
KEY_WORD_FILES = ["/Users/simon.hughes/Documents/Dice Data/LuceneTalk/Phrases.txt", 
                  # the file below was generated from our search log analysis; the file above came from the previous step.
                  # only one file is needed - it depends on which keywords / phrases matter for your domain
                  "/Users/simon.hughes/Documents/Dice Data/LuceneTalk/top_5k_keywords.txt"]

DOCS_FOLDER  = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk/ProcessedDocs"
MODEL_FILE = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk/keyword_model.w2v"
FILE_MASK = r".*\.txt"
MIN_SENT_LENGTH = 5

# W2Vec settings
MIN_WD_COUNT = 10   # minimum word frequency for the word2vec model; 10 seems to remove some of the noise
WINDOW_SIZE  = 5
VECTOR_SIZE  = 300
WORKERS = 8
TRAINING_ITERATIONS = 15
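
The stop word file and keyword files are plain text. As a made-up illustration (these lines are not copied from the actual Dice files): the stop word file has one word per line, and the keyword files use the solr / lucene synonym format, where a line is either a comma-separated list of equivalent terms or an explicit "lhs => rhs" mapping. Lines starting with "#" are treated as comments in both.

    # stop word file - one word per line
    and
    the

    # keyword / phrase file - solr synonym format
    sql server management
    c# developer
    j2ee,jee => j2ee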

In [2]:
# Shared
import os, re
from collections import defaultdict

def load_stop_words(stop_words_file):
    stop_words = set()
    with open(stop_words_file) as f:
        for line in f:
            word = line.strip()
            # skip blank lines and comments
            if word and word[0] != "#":
                stop_words.add(word.lower())
    return stop_words

re_collapse_spaces = re.compile(r"\s+")
def collapse_spaces(s):
    return re_collapse_spaces.sub(" ", s)

re1 = re.compile(r"[;:\'\"\*/\),\(\|\s]+")
def clean_str(s):
    s = str(s).replace("'s", " ")
    # hyphens and backslashes are simpler to handle as literal replacements than inside the regex above
    s = s.replace("-", " ").replace("\\", " ")
    s = re1.sub(" ", s).strip()
    return collapse_spaces(s)

def find_files(folder, regex, remove_empty = False):
    """
    Find all files matching the [regex] pattern in [folder]

    folder  :   string
                    folder to search (not recursive)
    regex   :   string (NOT regex object)
                    pattern to match
    remove_empty : boolean
                    if True, skip zero-length (empty) files
    """
    files = os.listdir(folder)
    matches = [os.path.abspath(os.path.join(folder, f))
               for f in files
               if re.search(regex, f, re.IGNORECASE)]

    if remove_empty:
        matches = [f for f in matches if os.path.getsize(f) > 0]
    matches.sort()
    return matches

In [3]:
from collections import defaultdict

class SynonymMapper(object):
    def __init__(self, mapper, nested, case_sensitive=False):
        self.case_sensitive = case_sensitive
        self.mapper = mapper
        self.nested = nested
        self.synonyms = set()
        for rhs in self.mapper.values():
            for syn in rhs:
                self.synonyms.add(syn)
        
    def is_synonym(self, term):
        return term in self.synonyms
        
    def map_synonyms(self, tokens, debug=False):
        mapped = []
        size = len(tokens)
        if not self.case_sensitive:
            tmp_tokens = map(lambda s: s.lower(), tokens)
        else:
            tmp_tokens = tokens
        ix = 0
        while ix < size:
            if debug:
                print "ix", ix
            best, best_key = None, None
            tmp_ix = ix        
            max_ix = ix
            current = ""
            d = self.nested
            while tmp_ix < size and tmp_tokens[tmp_ix] in d:
                current += tmp_tokens[tmp_ix] + " "
                key = current.strip()
                if key in self.mapper:
                    if debug:
                        if best is not None:
                            print(ix, tmp_ix, "new best:", key, "=>", self.mapper[key])
                        else:
                            print(ix, tmp_ix, "best:", key, "=>", self.mapper[key])
                    best = self.mapper[key]
                    best_key = key
                    max_ix = tmp_ix                    
                d = d[tmp_tokens[tmp_ix]]
                tmp_ix += 1
            if not best:
                #retain original casing
                mapped.append(tokens[ix])
            else:
                ix = max_ix
                #yields a set
                for item in sorted(best):
                    mapped.append(item)
            ix += 1
        return mapped

    def __repr__(self):
        return "Synonym Mapper: %i synonyms mapped" % len(self.mapper)

def build_synonym_filter(files, case_sensitive=False):
    mapper = defaultdict(set)
    # recursively defined defaultdict - an arbitrarily nested dict, used to look up multi-word phrases token by token
    def dd():
        return defaultdict(dd)
    nested_map = defaultdict(dd)
    file_locn = dict()
    if type(files) == str:
        files = [files]
    for f in files:
        with open(f, "r+") as fin:
            for line in fin:
                line = line.strip()
                if len(line) > 0 and not line[0] == "#":
                    if "=>" in line:
                        left, right = line.split("=>")
                        right = set(right.split(","))
                        left_parts = left.split(",")
                    else:
                        left_parts = line.split(",")
                        right = set(left_parts)

                    for syn in left_parts:
                        for rhs in right:
                            mapper[syn].add(rhs)
                        file_locn[syn] = f

                        tokens = syn.split(" ")
                        # walk / build the nested dict one token at a time so multi-word phrases can be matched later
                        d = nested_map[tokens[0]]
                        for token in tokens[1:]:
                            d = d[token]
    return SynonymMapper(mapper, nested_map, case_sensitive)
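
As a quick sanity check on the mapper, here is a minimal sketch (the synonym lines below are invented for illustration, not taken from the real keyword files): write a tiny synonym file, build the filter, and map a whitespace-tokenized string. Multi-word phrases are collapsed into single tokens via a greedy longest match against the nested dict.

In [ ]:
# hypothetical example - exercises build_synonym_filter / map_synonyms on made-up data
import os, tempfile

tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False)
tmp.write("sql server management\n")   # a multi-word phrase collapses into a single token
tmp.write("c# developer\n")
tmp.write("j2ee,jee => j2ee\n")        # explicit mapping, solr style
tmp.close()

demo_mapper = build_synonym_filter([tmp.name], False)
print demo_mapper.map_synonyms("senior c# developer with sql server management".split(" "))
# expected: ['senior', 'c# developer', 'with', 'sql server management']
os.remove(tmp.name)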

In [4]:
#String processing
def white_space_tokenize(s):
    return s.split(" ")

__punct__ = set(".?!,;:")
def remove_punct_at_end(s):
    while len(s) > 1 and s[-1] in __punct__:
        s = s[:-1]
    return s

#Token Filters
def fact_len_filter(min_len):
    # factory - returns a filter that keeps only tokens of at least min_len characters
    def len_filter(tokens):
        return filter(lambda s: len(s) >= min_len, tokens)
    return len_filter

remove_empty_tokens_filter = fact_len_filter(1)

def lower_case_filter(tokens):
    if type(tokens) == str:
        return tokens.lower()
    return map(lambda t: t.lower(), tokens)

def remove_punct_at_end_filter(tokens):
    return map(remove_punct_at_end, tokens)

def fact_is_synonym_filter(syn_mapper):
    def is_synonym_filter(tokens):
        return filter(syn_mapper.is_synonym, tokens)
    return is_synonym_filter

In [5]:
def fact_stop_word_filter(case_sensitive, stop_words_file):
    stop_words = set()
    with open(stop_words_file) as f:
        for line in f:
            word = line.strip()
            # skip blank lines and comments
            if word and word[0] != "#":
                if not case_sensitive:
                    word = word.lower()
                stop_words.add(word)

    def cs_stop_filter(tokens):
        return [tok for tok in tokens if tok not in stop_words]

    def stop_filter(tokens):
        return [tok for tok in tokens if tok.lower() not in stop_words]

    if case_sensitive:
        return cs_stop_filter
    else:
        return stop_filter

stop_filter = fact_stop_word_filter(False, STOP_WORDS_FILE)

In [6]:
def analyze(s, filters):
    temp = s
    for f in filters:
        temp = f(temp)
    return temp

def debug_analyze(s, filters):
    temp = s
    pad = 20
    print "START".ljust(pad), temp
    for f in filters:
        temp = f(temp)
        if type(temp) == list:
            s_temp = "|".join(map(str,temp))
        else:
            s_temp = str(temp)
        print f.func_name.ljust(pad), s_temp
    return temp

In [7]:
syn_mapper = build_synonym_filter(KEY_WORD_FILES, False)
syn_mapper


Out[7]:
Synonym Mapper: 20123 synonyms mapped

In [8]:
#Skills from text
is_a_synonym_filter = fact_is_synonym_filter(syn_mapper)
analysis_chain = [clean_str,
                  white_space_tokenize,
                  remove_punct_at_end_filter,
                  lower_case_filter,
                  stop_filter,
                  syn_mapper.map_synonyms, 
                  remove_empty_tokens_filter]
                  # is_a_synonym_filter] - un-comment to train on keywords only.
                  #                      - It is usually better to train on all words, and then filter the
                  #                        learned synonyms down to keywords (see the final cell below).

#Test
rslt = debug_analyze("$150k as400 Sr.\ Java/j2ee and the C#.! developer. FIT \"HOT\" dev. -IBM's business, sql server management", 
                     analysis_chain)


START                $150k as400 Sr.\ Java/j2ee and the C#.! developer. FIT "HOT" dev. -IBM's business, sql server management
clean_str            $150k as400 Sr. Java j2ee and the C#.! developer. FIT HOT dev. IBM business sql server management
white_space_tokenize $150k|as400|Sr.|Java|j2ee|and|the|C#.!|developer.|FIT|HOT|dev.|IBM|business|sql|server|management
remove_punct_at_end_filter $150k|as400|Sr|Java|j2ee|and|the|C#|developer|FIT|HOT|dev|IBM|business|sql|server|management
lower_case_filter    $150k|as400|sr|java|j2ee|and|the|c#|developer|fit|hot|dev|ibm|business|sql|server|management
stop_filter          $150k|as400|sr|java|j2ee|c#|developer|fit|dev|ibm|business|sql|server|management
map_synonyms         $150k|as400|sr java|j2ee|c# developer|fit|dev|ibm|business|sql server management
len_filter           $150k|as400|sr java|j2ee|c# developer|fit|dev|ibm|business|sql server management

In [9]:
import os, re, time
start = time.time()

sentences = []
files = find_files(DOCS_FOLDER, FILE_MASK, True)
print("%s files found in %s" % (len(files), DOCS_FOLDER))

for fname in files:
    with open(fname) as f:
        contents = f.read()
        sentences.extend(contents.split("\n"))
end = time.time()
print("Loading %i sentences took %s seconds" % (len(sentences), str(end - start)))


66989 files found in /Users/simon.hughes/Documents/Dice Data/LuceneTalk/ProcessedDocs
Loading 2376615 sentences took 15.9088010788 seconds

In [10]:
print len(sentences)
tokenized = []
print("Tokenizing sentences")
for i, sent in enumerate(sentences):
    tokens = analyze(sent, analysis_chain)
    if len(tokens) >= MIN_SENT_LENGTH:
        tokenized.append(tokens)
    if i % 100000 == 0:
        print(i)


2376615
Tokenizing sentences
0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000

Train Model


In [ ]:
import gensim, time
from gensim.models.word2vec import Word2Vec

start = time.time()

print("Training Model. This could take a while (10-60 mins for moderate collections). Get a coffee")
model = Word2Vec(tokenized, iter=TRAINING_ITERATIONS, size=VECTOR_SIZE, window=WINDOW_SIZE,
                 min_count=MIN_WD_COUNT, workers=WORKERS, sample=1e-5, hs=0, negative=20)
model.save(MODEL_FILE)
end = time.time()
print "Took %s seconds" % (end - start)

In [ ]:
# find the top n most similar terms to a keyword or phrase, e.g.:
# model.most_similar(positive="hadoop developer", topn=10)
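
Once the model has trained, the "train on all words, then filter to keywords" approach mentioned in the analysis chain cell can be applied as sketched below. This is a hypothetical helper, not part of the original pipeline: it assumes the model and syn_mapper built earlier in this notebook, and the query term must exist in the model vocabulary (i.e. appear at least MIN_WD_COUNT times in the corpus).

In [ ]:
# hypothetical sketch - reload the saved model (or reuse the in-memory one) and keep only
# neighbours that are known keywords / phrases from the keyword files
from gensim.models.word2vec import Word2Vec

model = Word2Vec.load(MODEL_FILE)

def similar_keywords(term, topn=50, keep=10):
    # over-fetch neighbours, then filter down to terms present in the keyword files
    results = model.most_similar(positive=[term], topn=topn)
    return [(word, sim) for word, sim in results if syn_mapper.is_synonym(word)][:keep]

# similar_keywords("hadoop developer")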